## 1. occupancy data
occupancy = read.csv("trains_train.csv")
## 2. station data
stations0 <- read.csv("stations.csv")
stations.sf <-
stations0 %>%
st_as_sf(coords = c("longitude", "latitude"), crs = 4326, agr = "constant") %>%
st_transform(31370)
stations <- stations.sf %>%
separate(URI, c("res", "station_name"), "NMBS/")%>%
select(station_name,name,country.code,avg_stop_times,
geometry)
### Check station distribution
belgium_basemap <- get_stamenmap(bbox = c(left = 2.739258, bottom = 49.563228,
right = 6.551513, top = 51.695260),
zoom = 11)
ggmap(belgium_basemap)+
geom_point(aes(x = longitude, y = latitude), data = stations0, size = 1, color ='darkred') +
labs(title="Train Station Distribution in Belgium\n",
caption = 'Figure 2.1',
x="Longtitude",
y="Latitude")+
plotTheme()## 3. Weather data
W701 <- read.csv("weather_data_july_1.csv")
W702 <- read.csv("weather_data_july_2.csv")
W801 <- read.csv("weather_data_aug_1.csv")
W802 <- read.csv("weather_data_aug_2.csv")
W901 <- read.csv("weather_data_sep_1.csv")
W902 <- read.csv("weather_data_sep_2.csv")
W101 <- read.csv("weather_data_oct_1.csv")
W102 <- read.csv("weather_data_oct_2.csv")
weather <- rbind(W701,W702,W801,W802,W901,W902,W101,W102) %>%
mutate(interval60 = ymd_hms(date_time))
## 4. Demographic data: population density
### population
### https://ec.europa.eu/eurostat/cache/metadata/en/demo_r_gind3_esms.htm
### topo data
### https://gisco-services.ec.europa.eu/distribution/v1/nuts-2016.html
bel <- st_read("NUTS_RG_60M_2016_4326_LEVL_3.shp") ## Reading layer `NUTS_RG_60M_2016_4326_LEVL_3' from data source `/Users/penguin/Box Sync/GitHub/Final Project/NUTS_RG_60M_2016_4326_LEVL_3.shp' using driver `ESRI Shapefile'
## Simple feature collection with 1522 features and 9 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -61.841 ymin: -21.376 xmax: 55.85 ymax: 71.178
## geographic CRS: WGS 84
popdens <- read.csv("demo_r_d2jan_1_Data.csv")
belpopden <- bel %>%
left_join(dplyr::select(popdens, c(GEO, Value)), by = c("FID" = "GEO")) %>%
st_transform(31370)
## 5. Facility data
facility <- read.csv("facilities.csv")
facility[is.na(facility)] <- 0
facility <-
facility %>%
mutate(facility_sum = select(.,ticket_vending_machine:audio_induction_loop) %>%
rowSums()) %>%
select(name,facility_sum)
stations <- merge(stations, facility,by.x="name",all.x=TRUE)
## 6. Railway data
### https://mapcruzin.com/free-belgium-arcgis-maps-shapefiles.htm
rail <- st_read("belgium-railways-shape/railways.shp") %>%
st_transform(4326)## Reading layer `railways' from data source `/Users/penguin/Box Sync/GitHub/Final Project/belgium-railways-shape/railways.shp' using driver `ESRI Shapefile'
## Simple feature collection with 3231 features and 3 fields
## geometry type: LINESTRING
## dimension: XY
## bbox: xmin: 2.558077 ymin: 49.54788 xmax: 6.363977 ymax: 51.45972
## geographic CRS: WGS 84
## 7. Line Info
lines <- read.csv('line_info.csv')
# Initial Dataset
data <-
occupancy %>%
mutate(from = as.character(from), to = as.character(to)) %>%
left_join(dplyr::select(stations, station_name), by = c("from" = "station_name")) %>%
st_sf() %>%
mutate(from.X = st_coordinates(.)[,1], from.Y = st_coordinates(.)[,2]) %>%
st_drop_geometry() %>%
left_join(dplyr::select(stations, station_name), by = c("to" = "station_name")) %>%
st_sf() %>%
mutate(to.X = st_coordinates(.)[,1], to.Y = st_coordinates(.)[,2]) %>%
st_drop_geometry() %>%
mutate(distance = sqrt((from.X - to.X)^2 + (from.Y - to.Y)^2)) %>%
arrange(-distance)
data <- data %>%
mutate(datetime <- paste(date,time),
datetime = parse_date_time(datetime, '%y-%m-%d %I:%M:%S %p'),
interval60 = floor_date(ymd_hms(datetime), unit = "hour"),
interval15 = floor_date(ymd_hms(datetime), unit = "15 mins"),
week = week(interval60),
dotw = wday(interval60)) %>%
select(-`datetime <- paste(date, time)`)
data <- data %>%
mutate(day = case_when(
dotw==1~"Mon",
dotw==2~"Tue",
dotw==3~"Wed",
dotw==4~"Thu",
dotw==5~"Fri",
dotw==6~"Sat",
dotw==7~"Sun"
)) %>%
select(-dotw)
data$vehicle_type <- gsub("[^a-zA-Z]", "", data$vehicle)
data$vehicle_type[data$vehicle_type==""] <- "undefined"
data$vehicle_type[data$vehicle_type=="ic"] <- "ICE"
## final data: departure station data
data_f_0 <- merge(data, stations, by.x = "from", by.y = "station_name",all.x = TRUE) %>%
st_as_sf()
data_f_00 <- merge(data_f_0,weather,by.x=c("name","interval60"),by.y=c("station_name","interval60"),all.x = TRUE)
## final data: destination station data
data_t_0 <- merge(data, stations, by.x = "to", by.y = "station_name",all.x = TRUE) %>%
st_as_sf()
data_t_00 <- merge(data_t_0,weather,by.x=c("name","interval60"),by.y=c("station_name","interval60"),all.x = TRUE)Calculate the distance of nearest three stations around each station and show the outcome by occupancy level.
Codes below calculate the distances of each station to the five major cities: Brussels,Liege,Charleroi, Gent, and Antwerpen measured by population size.
name <- c("Brussels","Liege","Charleroi", "Gent", "Antwerpen")
lat <- c(50.85045,50.63373, 50.41136, 51.05, 51.21989)
lon <- c(4.34878, 5.56749, 4.44448, 3.71667, 4.40346)
majorcities <- tibble(name, lat, lon) %>%
st_as_sf(coords = c("lon","lat"), crs = 4326, agr = "constant") %>%
st_transform(31370)
stations_copy$majc_nn5 <- nn_function(st_coordinates(stations), st_coordinates(majorcities), 5)
stations_copy <-
stations_copy %>%
mutate(Brussels = sqrt((st_coordinates(.)[,1] - st_coordinates(majorcities)[1,1])^2 + (st_coordinates(.)[,2] - st_coordinates(majorcities)[1,2])^2)) %>%
mutate(Liege = sqrt((st_coordinates(.)[,1] - st_coordinates(majorcities)[2,1])^2 + (st_coordinates(.)[,2] - st_coordinates(majorcities)[2,2])^2)) %>%
mutate(Charleroi = sqrt((st_coordinates(.)[,1] - st_coordinates(majorcities)[3,1])^2 + (st_coordinates(.)[,2] - st_coordinates(majorcities)[3,2])^2)) %>%
mutate(Gent = sqrt((st_coordinates(.)[,1] - st_coordinates(majorcities)[4,1])^2 + (st_coordinates(.)[,2] - st_coordinates(majorcities)[4,2])^2)) %>%
mutate(Antwerpen = sqrt((st_coordinates(.)[,1] - st_coordinates(majorcities)[5,1])^2 + (st_coordinates(.)[,2] - st_coordinates(majorcities)[5,2])^2)) Codes below are to calculate the proportion of each occupancy level for each origin station and destination station respectively.
## Origin Station
data_f_00_h <- data_f_00 %>%
filter(occupancy=='high')
data_f_00_m <- data_f_00 %>%
filter(occupancy=='medium')
data_f_00_l <- data_f_00 %>%
filter(occupancy=='low')
sta_from_total <- as.data.frame(table(data_f_00$from)) %>%
filter(Var1!='(null)') %>%
mutate(station_f=as.character(Var1)) %>%
mutate(F_total=Freq) %>%
select(3,4)
sta_from_h <- as.data.frame(table(data_f_00_h$from)) %>%
filter(Var1!='(null)') %>%
mutate(station_f=as.character(Var1)) %>%
mutate(F_H_freq=Freq) %>%
select(3,4)
sta_from_m <- as.data.frame(table(data_f_00_m$from)) %>%
filter(Var1!='(null)') %>%
mutate(station_f=as.character(Var1)) %>%
mutate(F_M_freq=Freq) %>%
select(3,4)
sta_from_l <- as.data.frame(table(data_f_00_l$from)) %>%
filter(Var1!='(null)') %>%
mutate(station_f=as.character(Var1)) %>%
mutate(F_L_freq = Freq) %>%
select(3,4)
## Destination Station
data_t_00_h <- data_t_00 %>%
filter(occupancy=='high')
data_t_00_m <- data_t_00 %>%
filter(occupancy=='medium')
data_t_00_l <- data_t_00 %>%
filter(occupancy=='low')
sta_to_total <- as.data.frame(table(data_t_00$to)) %>%
filter(Var1!='(null)') %>%
mutate(station_t = as.character(Var1)) %>%
mutate(T_total = Freq) %>%
select(3,4)
sta_to_h <- as.data.frame(table(data_t_00_h$to)) %>%
filter(Var1!='(null)') %>%
mutate(station_t = as.character(Var1)) %>%
mutate(T_H_freq=Freq) %>%
select(3,4)
sta_to_m <- as.data.frame(table(data_t_00_m$to)) %>%
filter(Var1!='(null)') %>%
mutate(station_t=as.character(Var1)) %>%
mutate(T_M_freq=Freq) %>%
select(3,4)
sta_to_l <- as.data.frame(table(data_t_00_l$to)) %>%
filter(Var1!='(null)') %>%
mutate(station_t=as.character(Var1)) %>%
mutate(T_L_freq = Freq) %>%
select(3,4)
stations_occ <- merge(stations,sta_from_total,by.x="station_name",by.y='station_f',all.x=T)
stations_occ <- merge(stations_occ,sta_from_h,by.x="station_name",by.y='station_f',all.x=T)
stations_occ <- merge(stations_occ,sta_from_m,by.x="station_name",by.y='station_f',all.x=T)
stations_occ <- merge(stations_occ,sta_from_l,by.x="station_name",by.y='station_f',all.x=T)
stations_occ <- merge(stations_occ,sta_to_total,by.x="station_name",by.y='station_t',all.x=T)
stations_occ <- merge(stations_occ,sta_to_h,by.x="station_name",by.y='station_t',all.x=T)
stations_occ <- merge(stations_occ,sta_to_m,by.x="station_name",by.y='station_t',all.x=T)
stations_occ <- merge(stations_occ,sta_to_l,by.x="station_name",by.y='station_t',all.x=T)
stations_occ$F_H_com <- stations_occ$F_H_freq/stations_occ$F_total
stations_occ$F_M_com <- stations_occ$F_M_freq/stations_occ$F_total
stations_occ$F_L_com <- stations_occ$F_L_freq/stations_occ$F_total
stations_occ$T_H_com <- stations_occ$T_H_freq/stations_occ$T_total
stations_occ$T_M_com <- stations_occ$T_M_freq/stations_occ$T_total
stations_occ$T_L_com <- stations_occ$T_L_freq/stations_occ$T_total
stations_occq <- select(stations_occ,1,14,15,16,17,18,19,20)Codes below calculate the average occupancy level proportion of the nearest three stations of each stations.
neighborList <- knn2nb(knearneigh(st_coordinates(stations), 3))
occupancy$from <- as.character(occupancy$from)
occupancy$to <- as.character(occupancy$to)
getnb_from <- function(x,i, from, occ){
nbcount = occupancy %>%
left_join(dplyr::select(stations_copy, c(station_name, stationsdist)), by = c(from = "station_name")) %>%
filter(from %in% stations$station_name[neighborList[[i]]]) %>%
tally()
nbcount1 = occupancy %>%
left_join(dplyr::select(stations_copy, c(station_name, stationsdist)), by = c(from = "station_name")) %>%
filter(from %in% stations$station_name[neighborList[[i]]] & occupancy == occ) %>%
tally()
nbcount2 = nbcount1/nbcount
return(nbcount2[,1])
}
#avg composition of low occupancy from neighbor from stations
nblist = list()
for (i in 1:length(neighborList)){
nblist[[i]] = getnb_from(neighborList,i,"from","low")
}
stations_copy$nb_occ_f_low <- nblist
#avg composition of medium occupancy from neighbor from stations
nblist = list()
for (i in 1:length(neighborList)){
nblist[[i]] = getnb_from(neighborList,i,"from","medium")
}
stations_copy$nb_occ_f_medium <- nblist
#avg composition of high occupancy from neighbor from stations
nblist = list()
for (i in 1:length(neighborList)){
nblist[[i]] = getnb_from(neighborList,i,"from","high")
}
stations_copy$nb_occ_f_high <- nblist
#avg composition of low occupancy from neighbor to stations
nblist = list()
for (i in 1:length(neighborList)){
nblist[[i]] = getnb_from(neighborList,i,"to","low")
}
stations_copy$nb_occ_t_low <- nblist
#avg composition of medium occupancy from neighbor to stations
nblist = list()
for (i in 1:length(neighborList)){
nblist[[i]] = getnb_from(neighborList,i,"to","medium")
}
stations_copy$nb_occ_t_medium <- nblist
#avg composition of high occupancy from neighbor to stations
nblist = list()
for (i in 1:length(neighborList)){
nblist[[i]] = getnb_from(neighborList,i,"to","high")
}
stations_copy$nb_occ_t_high <- nblistThen we explore the popularity of each station, in other words, we can calculate the number of lines connected to each origin station and destination station respectively.
connect <- function(line,stt){
stt <- as.data.frame(stt) %>%
select(1)
stt$connect <- NA
for(i in seq(1:nrow(stt))){
if(length(table(line$stops==stt[i,1]))==2){
line_i <- line %>%
filter(stops==stt[i,1])
stt$connect[i] <- nrow(line_i)
}
}
return(stt)
}
lines_connect <- lines %>%
select(5,7) %>%
mutate(stops = gsub("\\[","",stopping_station_ids)) %>%
mutate(stops = gsub("\\]","",stops)) %>%
mutate(stops = gsub("'", "",stops)) %>%
mutate(stops = gsub(" ", "",stops)) %>%
unique() %>%
select(3) %>%
separate_rows(stops, sep=",") %>%
group_by(stops) %>%
summarize(connections = n())
stations_occ_line <- merge(stations_occq, lines_connect,
by.x = "station_name", by.y = "stops", all.x = TRUE)
stations_final <- merge(st_drop_geometry(stations_copy),
st_drop_geometry(stations_occ_line),
by = "station_name") %>%
select(-name,-country.code,-avg_stop_times,-facility_sum)Creating time lag variables will add additional nuance about the demand during a given time period. Since the period of given data barely covers holidays, thus this analysis creates lagHour, lag2Hours,lag3Hours, lag4Hours, and lag12Hours to show occupancy level of each shift before and after a certain time whth holiday effect ruled out.
ggplot() +
geom_sf(data = rail, color = "grey") +
geom_sf(data = data_f %>%
filter(country.code == 'be'),
aes(color = stationsdist) ,size = 0.5)+
labs(title="Station Density by Occupancy Level",
subtitle = 'Distance of each station to the nearest three stations\n',
caption = 'Figure')+
scale_colour_viridis(direction = -1,
discrete = FALSE, option = "plasma",
name = 'Counts')+
facet_wrap(~occupancy, ncol = 3) +
mapTheme()+
plotTheme() +
theme(legend.position = "bottom")ggplot() +
geom_sf(data = rail, color = "grey") +
geom_sf(data = data_f %>%
filter(country.code == 'be'),
aes(color = majc_nn5) ,size = 0.5)+
labs(title="Distance of Stations to Major Cities",
subtitle = ' ',
caption = 'Figure')+
scale_colour_viridis(direction = -1,
discrete = FALSE, option = "plasma",
name = 'Counts')+
facet_wrap(~occupancy, ncol = 3) +
mapTheme()+
plotTheme() ggplot() +
geom_sf(data = rail, color = "grey") +
geom_sf(data = data_f %>%
filter(country.code == 'be'),
aes(color = F_H_com) ,size = 0.5)+
labs(title="High Occupancy Level Proportion of Each Station",
subtitle = ' ',
caption = 'Figure')+
scale_colour_viridis(direction = -1,
discrete = FALSE, option = "plasma",
name = 'Ratio')+
mapTheme()+
plotTheme() ggplot() +
geom_sf(data = rail, color = "grey") +
geom_sf(data = data_f %>%
filter(country.code == 'be'),
aes(color = connections) ,size = 0.5)+
labs(title="Station Popularity across Nation",
subtitle = 'Counts of connections of each station\n',
caption = 'Figure')+
scale_colour_viridis(direction = -1,
discrete = FALSE, option = "plasma",
name = 'Counts')+
mapTheme()+
plotTheme() ggplot() +
geom_sf(data = rail, color = "grey") +
geom_sf(data = data_f %>%
filter(country.code == 'be'),
aes(color = facility_sum) ,size = 0.5)+
labs(title="Facility Counts across Stations",
subtitle = 'Departure Station\n',
caption = 'Figure')+
scale_colour_viridis(direction = -1,
discrete = FALSE, option = "plasma",
name = 'Counts')+
mapTheme()+
plotTheme() grid.arrange(
ggplot(data_f, aes(interval60,humidity)) + geom_line(color = "#6897BB") +
labs(title="Weather characteristics: Percipitation",
x="Hour", y="Perecipitation") +
plotTheme(),
ggplot(data_f, aes(interval60,windspeed)) + geom_line(color = "#6897BB") +
labs(title="Weather characteristics: Wind Speed",
x="Hour", y="Wind Speed") +
plotTheme(),
ggplot(data_f, aes(interval60,temperature)) + geom_line(color = "#6897BB") +
labs(title="Weather characteristics: Temperature",
x="Hour", y="temperature",
caption="Figure 3.1") +
plotTheme())ggplot(data_f %>%
mutate(hour = hour(datetime)))+
geom_freqpoly(aes(hour, color = occupancy), binwidth = 1)+
scale_colour_manual(values = palette3) +
labs(title="Train Shift Counts by Occupancy Level (General)",
subtitle = 'Belgium, 2016\n',
caption = "Figure",
x="Hour of the Day",
y="Counts")+
plotTheme()ggplot(data_f %>% mutate(hour = hour(datetime),
weekend = ifelse(day %in% c("Sun", "Sat"), "Weekend", "Weekday"))) +
geom_freqpoly(aes(hour, color = weekend), binwidth = 1)+
scale_color_manual(values = palette2,
name = 'Period') +
labs(title="Train Shift Counts by Occupancy Level (Week Periods)",
subtitle = 'Belgium, 2016\n',
caption = 'Figure ',
x="Hour of the Day",
y="Counts")+
facet_wrap(~occupancy,ncol=3) +
plotTheme()ggplot(data_f %>% mutate(hour = hour(datetime))) +
geom_freqpoly(aes(hour, color = day), binwidth = 1)+
scale_color_manual(values = palette7,
name = 'Week Day') +
labs(title="Train Shift Counts by Occupancy Level (Week Days)",
subtitle = 'Belgium, 2016\n',
caption = 'Figure ',
x="Hour of the Day",
y="Counts")+
facet_wrap(~occupancy,ncol=3) +
plotTheme()test04 <- data %>%
group_by(vehicle) %>%
summarise(n = n()) %>%
filter(n >= 12)
freq_v = c("IC1518","IC429","IC1515","IC407","P7305","IC1807","IC3631","1828","8015","IC1831","IC539","P7444","S83978", "IC1507","IC716","L557","S23665","IC3432","IC4317","S53586")
p1 <- ggplot() +
geom_sf(data = rail, aes(color = "grey")) +
geom_sf(data=subset(data_f, data_f$vehicle %in% freq_v),aes(colour = vehicle),size=2,show.legend = "point")+
labs(title="Top 20 Busiest Lines",
subtitle = 'Departure Stations\n',
caption = "Figure")+
mapTheme()+
plotTheme() +
theme(legend.position = "bottom") +
transition_manual(factor(vehicle, levels = freq_v), cumulative = TRUE)
gganimate::animate(p1, duration=15,renderer = gifski_renderer())
## occupancy by lines: destination
p2 <- ggplot()+
geom_sf(data=rail,aes(color = "grey"))+
geom_sf(data=subset(data_t, data_t$vehicle %in% freq_v),aes(colour = vehicle),size=2,show.legend = "point")+
labs(title="Top 20 Frequent Lines Plotted to Destination\n",
caption = 'Figure')+
mapTheme()+
plotTheme() +
theme(legend.position = "bottom") +
transition_manual(factor(vehicle, levels = freq_v), cumulative = TRUE)
gganimate::animate(p2, duration=15,renderer = gifski_renderer())ggplot() +
geom_sf(data = rail, color = "grey") +
geom_sf(data = data_f %>%
filter(country.code == 'be'),
aes(color = occupancy) ,size = 0.5)+
labs(title="Occupancy Level across Departure Station by Week Days\n",
caption = 'Figure')+
facet_wrap(~day,nrow =2) +
scale_colour_manual(values = palette3,
name = 'Occupancy') +
mapTheme()+
plotTheme() ggplot() +
geom_sf(data = rail, color = "grey") +
geom_sf(data = data_t %>%
filter(country.code == 'be'),
aes(color = occupancy) ,size = 0.5)+
labs(title="Occupancy Level across Destination Station by Week Days\n",
caption = 'Figure')+
facet_wrap(~day,nrow =2) +
scale_colour_manual(values = palette3,
name = 'Occupancy') +
mapTheme()+
plotTheme() ggplot(data_f %>%
mutate(hour = hour(datetime)) %>%
filter(occupancy == "high" | occupancy == "medium" | occupancy == "low"),
aes(temperature, color = day))+
geom_freqpoly(binwidth = 1)+
facet_wrap(~occupancy,ncol=3) +
scale_colour_manual(values = palette7,
name = "Week Day") +
labs(title="Train Shift Counts across Temperature by Occupancy Level",
subtitle = "Belgium, 2016\n",
caption = "Figure ",
x="Temperature",
y="Counts") +
plotTheme()ggplot(data_f %>%
mutate(hour = hour(datetime)) %>%
filter(occupancy == "high" | occupancy == "medium" | occupancy == "low"),
aes(humidity, color = day))+
geom_freqpoly(binwidth = 1)+
facet_wrap(~occupancy,ncol=3) +
scale_colour_manual(values = palette7,
name = "Week Day") +
labs(title="Train Shift Counts across Humidity by Occupancy Level",
subtitle = "Belgium, 2016\n",
caption = "Figure ",
x="Humidity",
y="Counts") +
plotTheme()ggplot(data_f %>%
mutate(hour = hour(datetime)) %>%
filter(occupancy == "high" | occupancy == "medium" | occupancy == "low"),
aes(windspeed, color = day))+
geom_freqpoly(binwidth = 1)+
facet_wrap(~occupancy,ncol=3) +
scale_colour_manual(values = palette7,
name = "Week Day") +
labs(title="Train Shift Counts across Windspeed by Occupancy Level",
subtitle = "Belgium, 2016\n",
caption = "Figure ",
x="Windspeed",
y="Counts") +
plotTheme()ggplot(data_f %>%
mutate(hour = hour(datetime)) %>%
filter(occupancy == "high" | occupancy == "medium" | occupancy == "low"),
aes(visibility, color = day))+
geom_freqpoly(binwidth = 1)+
facet_wrap(~occupancy,ncol=3) +
scale_colour_manual(values = palette7,
name = "Week Day") +
labs(title="Train Shift Counts across Visibility by Occupancy Level",
subtitle = "Belgium, 2016\n",
caption = "Figure ",
x="Visibility",
y="Counts") +
plotTheme()